In [ ]:
from planet4.dbscan import DBScanner
from planet4 import io, clustering, plotting, markings, dbscan
import seaborn as sns
sns.set_context('notebook')
blotchcols = markings.Blotch.to_average
fancols = markings.Fan.to_average
In [ ]:
# not automatically initialized
%matplotlib ipympl
In [ ]:
import socket
if socket.gethostname().startswith('macd2860'):
%config InlineBackend.figure_format = 'retina'
In [ ]:
%config InlineBackend.figure_format = 'png'
In [ ]:
from nbtools.logging import setup_live_logging
import logging
logger = setup_live_logging('planet4.dbscan', logging.DEBUG)
In [ ]:
def get_gold_ids(person, path=None):
    """Read the image IDs of one gold-data list.

    Parameters
    ----------
    person : {"GP", "MES", "KMA", "common_gold_data"}
        Stem of the text file to read (``<person>.txt``).
    path : str or pathlib.Path, optional
        Folder that holds the gold-data text files. When omitted, the
        original hard-coded Dropbox folder is used.

    Returns
    -------
    pd.Series
        The single column of the file, one entry per line.
    """
    # Imported locally so the cell does not rely on names injected by an
    # IPython startup profile.
    from pathlib import Path

    import pandas as pd

    if path is None:
        path = Path("/Users/klay6683/Dropbox/Documents/latex_docs/p4_paper1/gold_data")
    # `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
    # 2.0; squeezing the column axis of the resulting frame is the replacement.
    table = pd.read_csv(Path(path) / f"{person}.txt", header=None)
    return table.squeeze("columns")
In [ ]:
ids = get_gold_ids('common_gold_data')
In [ ]:
ids = 'br5 bu5 ek1 pbr 1dt 1dr 1fe dch bvc 1c5 1ab 1dk 18s 1b0 1cl 1ct 1at 1al 1aa 10p 185 139 13t 15k 17a'.split()
In [ ]:
def create_and_save_randoms():
    """Draw 100 random entries from the global `ids` and store them.

    The sample is taken with `np.random.choice` (which samples with
    replacement by default) and written to 'myids.npy' in the current
    working directory.
    """
    np.save('myids.npy', np.random.choice(ids, 100))
myids = np.load('myids.npy')
len(myids)
In [ ]:
combined = list(ids) + list(myids)
In [ ]:
%store combined
In [ ]:
db = DBScanner(savedir='gold_with_angle_std', do_large_run=True)
In [ ]:
for id_ in ids:
print(id_)
db.cluster_image_id(id_)
In [ ]:
bucket = []
for img_id in ids:
p4id = markings.ImageID(img_id, scope='planet4', data=db.data)
db.pm.obsid = p4id.image_name
db.pm.id = img_id
try:
bucket.extend(db.pm.fandf.angle_std.values)
except FileNotFoundError:
continue
In [ ]:
len(bucket)
In [ ]:
bucket = np.array(bucket)
In [ ]:
import seaborn as sns
In [ ]:
sns.set_context('paper')
In [ ]:
bins = np.arange(0, 22, 1)
In [ ]:
pd.Series(bucket).to_csv("angle_std_bucket.csv", index=False)
In [ ]:
fig, ax = plt.subplots(constrained_layout=True)
sns.distplot(bucket, kde=False, bins=bins)
ax.set_title("Histogram of angular STD for merged fan clusters")
ax.set_xlabel("Fan angle standard deviation per cluster [deg]")
ax.set_ylabel("Histogram Counts")
In [ ]:
db.pm.fanfile
In [ ]:
db.pm.fandf.angle_std
In [ ]:
np.save('combined_ids_to_check.npy', np.array(combined))
In [ ]:
from nbtools import execute_in_parallel
In [ ]:
def process_id(id_):
    """Run the fan and blotch parameter scans for one image ID.

    Written to be shipped to parallel workers: the `DBScanner` import lives
    inside the function body.

    Parameters
    ----------
    id_ : str
        Image ID handed to `DBScanner.parameter_scan`.
    """
    from planet4.dbscan import DBScanner

    scanner = DBScanner(savedir='newest_clustering_review', do_large_run=True)
    for marking_kind in ('fan', 'blotch'):
        scanner.parameter_scan(
            id_,
            marking_kind,
            msf_vals_to_scan=[0.1, 0.13],
            eps_vals_to_scan=[20, 25, 30],
            size_to_scan='large',
        )
Here are my comments from the review:
APF0000br5 - seems like the big blotch should have been seen
APF0000bu5 - seems like the middle fan should be there - looks like too strict a cut rather than a clustering issue?
APF0000ek1 - yellow final blotch comes out of nowhere
APF0000pbr - bottom right blotch seems like it should have survived
APF00001dt - cyan fan seems bigger than it should be
In [ ]:
results = execute_in_parallel(process_id, combined)
In [ ]:
for id_ in ids:
print(id_)
for kind in ['blotch']:
print(kind)
dbscanner = DBScanner(savedir='do_cluster_on_large', do_large_run=True)
# dbscanner.parameter_scan(kind, [0.1, 0.13], [30, 50, 70])
# for blotch:
dbscanner.cluster_and_plot(id_, kind, saveplot=True)
plt.close('all')
In [ ]:
for id_ in ithaca_sample:
print(id_)
for kind in ['blotch']:
print(kind)
dbscanner = DBScanner(id_)
# dbscanner.parameter_scan(kind, [0.1, 0.13], [30, 50, 70])
# for blotch:
dbscanner.parameter_scan(kind, [0.1, 0.13], [15, 22, 30])
plt.close('all')
In [ ]:
for id_ in ithaca_sample:
print(id_)
for kind in ['fan']:
print(kind)
dbscanner = DBScanner(id_)
dbscanner.parameter_scan(kind, [0.1, 0.13], [30, 50, 70])
# for blotch:
# dbscanner.parameter_scan(kind, [0.1, 0.13], [15, 22, 30])
plt.close('all')
In [ ]:
from shapely.geometry import Point
p1 = Point(266.4, 470.56)
p2 = Point(262.072, 469.679)
p1.distance(p2)
In [ ]:
%matplotlib ipympl
In [ ]:
from planet4.catalog_production import ReleaseManager
In [ ]:
rm = ReleaseManager('v1.0')
rm.savefolder
In [ ]:
db = DBScanner(savedir='examples_for_paper', do_large_run=True)
In [ ]:
db.eps_values
In [ ]:
db.cluster_and_plot('arp', 'fan')
In [ ]:
plotting.plot_image_id_pipeline('gr0', datapath='gold_per_obsid', via_obsid=True)
In [ ]:
plt.close('all')
In [ ]:
id_ = ids[14]
In [ ]:
db.parameter_scan(id_, 'fan', msf_vals_to_scan=(0.1, 0.13),
eps_vals_to_scan=(10, 20, 30), size_to_scan='small')
In [ ]:
plotting.plot_image_id_pipeline(id_, datapath=rm.savefolder, save=True, saveroot='./plots')
In [ ]:
data = io.DBManager().get_image_id_markings('arp')
In [ ]:
data.classification_id.nunique()
In [ ]:
data.groupby(['classification_id', 'user_name']).marking.value_counts()
In [ ]:
data[data.marking=='blotch'].shape
In [ ]:
db.parameter_scan('bsn', 'blotch', [0.10, 0.13], [10, 12, 14], size_to_scan='small', )
In [ ]:
v1 = (8.9, 87.3)
v2 = (19.8, 79.8)
In [ ]:
v1 = np.array(v1)
v2 = np.array(v2)
In [ ]:
from numpy.linalg import norm
In [ ]:
norm(v1 - v2)
In [ ]:
# The second positional argument of `numpy.linalg.norm` is `ord`, not a second
# vector; the distance between two points is the norm of their difference.
norm(np.array(v1) - np.array(v2))
In [ ]:
db.save_results
In [ ]:
db.final_clusters['blotch']
In [ ]:
import seaborn as sns
sns.set_context('notebook')
In [ ]:
import itertools
palette = itertools.cycle(sns.color_palette('bright'))
fig, ax = plt.subplots()
for b in db.final_clusters['blotch'][1]:
db.p4id.plot_blotches(data=b, user_color=next(palette), ax=ax)
ax.set_title('second round')
fig.savefig('second_round.png', dpi=150)
In [ ]:
db.parameter_scan('1wg', 'fan',
msf_vals_to_scan=[0.1, 0.13],
eps_vals_to_scan=[20, 25, 30],
size_to_scan='large')
In [ ]:
db.parameter_scan('15k', 'blotch',
msf_vals_to_scan=[0.1, 0.13],
eps_vals_to_scan=[10, 12, 15],
size_to_scan='small')
In [ ]:
fig, ax = plt.subplots()
db.p4id.plot_blotches(ax=ax)
ax.set_title('input data')
fig.savefig('input_data.png', dpi=150)
In [ ]:
blotches = db.p4id.filter_data('blotch').dropna(how='all', axis=1)
In [ ]:
blotches['x y radius_1 radius_2 angle'.split()].sort_values(by='radius_1')
In [ ]:
fans = db.p4id.filter_data('fan')
In [ ]:
xyclusters = pd.concat(db.cluster_xy(blotches, 15)).dropna(how='all', axis=1)
In [ ]:
blotches.shape
In [ ]:
xyclusters.shape
In [ ]:
blotches[~blotches.isin(xyclusters).all(1)].shape
In [ ]:
In [ ]:
In [ ]:
In [ ]:
db.eps_values['blotch']['angle']= None
In [ ]:
db.eps_values['blotch']['angle']= 20
In [ ]:
db.eps_values['blotch']['radius']['small']=30
In [ ]:
db.eps_values
In [ ]:
db.parameter_scan('bp7', 'blotch', [0.1, 0.13], [15,22,30], 'small')
In [ ]:
db.cluster_image_id('bz7')
In [ ]:
db.cluster_and_plot('bz7', 'blotch')
In [ ]:
db.min_samples
In [ ]:
db.cluster_image_id('bb6')
In [ ]:
db.final_clusters['blotch'][0][4][markings.Blotch.to_average+['user_name']]
In [ ]:
db.final_clusters['blotch'][0][2][markings.Blotch.to_average+['user_name']]
In [ ]:
In [ ]:
%debug
In [ ]:
db.parameter_scan('blotch', [0.1, 0.13], [15, 22, 30])
In [ ]:
db.parameter_scan('fan', [0.1,0.15], [30, 50,70])
In [ ]:
db.pipeline(10, 3, 50)
In [ ]:
db.store_folder
In [ ]:
sizes = []
for _,b in blotches.iterrows():
B = markings.Blotch(b, scope='planet4')
sizes.append(B.area)
In [ ]:
%matplotlib nbagg
In [ ]:
plt.figure()
plt.hist(sizes, bins=50);
In [ ]:
db.parameter_scan('fan', [0.1,0.15], [10, 15, 20])
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
db.cluster_and_plot('blotch', 20, 3)
ax = plt.gca()
ax.get_title()
In [ ]:
In [ ]:
db.parameter_scan('fan', [0.07, 0.1, 0.15], [15,20])
In [ ]:
db.parameter_scan('blotch', [0.07, 0.1, 0.15], [15,20])
In [ ]:
ek1.cluster_and_plot('blotch', 20, 3)
In [ ]:
ek1.p4id.plot_blotches(data=ek1.finalclusters[5])
In [ ]:
ek1.p4id.plot_blotches(data=ek1.averaged[5])
In [ ]:
In [ ]:
In [ ]:
p4id = markings.ImageID('1fe', scope='planet4')
blotches = p4id.get_blotches()
In [ ]:
X = blotches['x y'.split()]
In [ ]:
dbscanner = DBScanner(X, min_samples=5, eps=20)
In [ ]:
clusters = [blotches.loc[idx] for idx in dbscanner.clustered_indices]
In [ ]:
from planet4.clustering import cluster_angles
In [ ]:
bucket = []
for cluster in clusters:
print(cluster.shape)
bucket.append([cluster.loc[idx] for idx in cluster_angles(cluster, 'blotch', 5)])
In [ ]:
for item in bucket:
for subitem in item:
print(subitem.shape)
In [ ]:
cluster_and_plot('1dr', production=True, dynamic=True,
msf=msf, eps=eps, radii=False, dbscan=True,
figtitle=figtitle)
In [ ]:
In [ ]:
cm = cluster_and_plot('1dt', production=False, msf=0.1, dynamic=True,
radii=False, dbscan=False)
In [ ]:
df = pd.read_csv('fuckdf.csv')
In [ ]:
(df - df.mean(axis=0))/df.std(axis=0)
In [ ]:
# Keep only the rows in which every column lies within one standard deviation
# of its column mean. The row mask has to be reduced along axis=1; axis=0
# yields a mask indexed by column names that cannot select rows.
df[df.apply(lambda x: np.abs(x - x.mean()) / x.std() < 1).all(axis=1)]
In [ ]:
from scipy.stats import zscore
In [ ]:
zscore??
In [ ]:
pd.DataFrame(zscore(df,ddof=1))
In [ ]:
def highlight_bigger_std(x, n_std=2):
    """Return CSS styles that mark outliers of a Series in yellow.

    Intended for `DataFrame.style.apply`. A value counts as an outlier when
    its absolute distance from the mean of `x`, divided by the standard
    deviation of `x`, is larger than `n_std`.

    Parameters
    ----------
    x : pd.Series
        Values of one column (or row) handed in by the styler.
    n_std : float, optional
        Threshold in units of standard deviations. Default: 2

    Returns
    -------
    list of str
        'background-color: yellow' for outliers, '' otherwise, in the order
        of `x`.
    """
    is_outlier = np.abs(x - x.mean()) / x.std() > n_std
    return ['background-color: yellow' if flag else '' for flag in is_outlier]
In [ ]:
df.style.apply(highlight_bigger_std)
In [ ]:
In [ ]:
In [ ]:
cm = cluster_and_plot('pbr', production=False, msf=0.1, dynamic=True,
radii=False)
In [ ]:
cm = cluster_and_plot('pbr',eps=20, production=False, msf=0.1, dynamic=True,
radii=True)
In [ ]:
cm.db
In [ ]:
imgid = '1at'
imgid = 'dch'
imgid = 'bvc'
imgid = '1dr'
imgid = '1fe'
imgid = 'br5'
imgid = 'ek1'
p4id = markings.ImageID(imgid, scope='planet4')
In [ ]:
data = p4id.get_blotches()
In [ ]:
from planet4.dbscan import DBScanner
In [ ]:
current_X = data[['x','y']].values
In [ ]:
clusterer = DBScanner(current_X, eps=15, min_samples=3)
In [ ]:
clusterer.n_clusters_
In [ ]:
cluster = data.loc[clusterer.clustered_indices[0]]
p4id.plot_blotches(blotches=cluster,with_center=True)
In [ ]:
cluster[blotchcols]
In [ ]:
indices = clustering.cluster_angles(cluster, 'blotch', eps_blotchangle=10)
indices
In [ ]:
angle_cluster_data = cluster.loc[indices[0], blotchcols +['user_name']]
In [ ]:
angle_cluster_data
In [ ]:
df = angle_cluster_data[blotchcols]
In [ ]:
df[df.apply(lambda x: np.abs(x - x.mean()) / x.std() < 1).all(axis=1)]
In [ ]:
clustering.get_average_object(angle_cluster_data[blotchcols], 'blotch')
In [ ]:
p4id.plot_blotches(blotches=cluster.loc[indices[0]], with_center=True)
In [ ]:
df = cluster.loc[indices[0]][blotchcols]
In [ ]:
df['area'] = df.apply(lambda x: np.pi*x.radius_1*x.radius_2, axis=1)
In [ ]:
df
In [ ]:
col='radius_1'
In [ ]:
df.radius_1.std()
In [ ]:
df[np.abs(df[col]-df[col].mean())<=(1*df[col].std())]
In [ ]:
df[df.apply(lambda x: np.abs(x - x.mean()) / x.std() < 1).all(axis=1)]
In [ ]:
subclus
In [ ]:
testblotch = markings.Blotch?
In [ ]:
testblotchdata = dict(x=340, y=340, angle=127, radius_1=250, radius_2=186)
In [ ]:
testblotch = markings.Blotch(
pd.DataFrame(
testblotchdata, index=[0]), scope='planet4')
fig, ax = plt.subplots()
ax.add_artist(testblotch)
ax.set_xlim(0, 800)
ax.set_ylim(0, 600)
In [ ]:
testblotch = markings.Blotch(
pd.DataFrame(testblotchdata, index=[0]),
scope='planet4')
p4id.plot_blotches(blotches=[testblotch])
In [ ]:
from sklearn.cluster import DBSCAN
class DBScanner(object):
    """Run DBSCAN on a set of points, plot the result and collect the clusters.

    Instantiating the object immediately executes `_run_DBSCAN()`, which
    clusters `X`, draws the samples into the current matplotlib axes and
    stores one boolean membership mask per cluster.

    Parameters
    ----------
    X : numpy.array
        Data to be clustered, one row per sample. The plotting code indexes
        columns, so a 2D layout (n_samples, n_features) is required.
    eps : int or float, optional
        Distance criterion for the DBSCAN algorithm. Samples further away than
        this value don't become members of the currently considered cluster.
        Default: 15
    min_samples : int, optional
        Minimum number of samples required for a cluster to be created.
        Default: 3
    only_core : bool, optional
        If True, the stored membership masks contain only the core samples of
        each cluster. Default: False

    Attributes
    ----------
    db : sklearn.cluster.DBSCAN
        The fitted estimator.
    n_clusters_ : int
        Number of clusters found, not counting the noise label -1.
    clustered_indices : list of boolean arrays
        One mask over the rows of `X` per cluster, ordered by cluster label.
        Noise samples are not included.
    n_rejected : int
        Number of samples labelled as noise.
    """

    def __init__(self, X, eps=15, min_samples=3, only_core=False):
        self.X = X
        self.eps = eps
        self.min_samples = min_samples
        self.only_core = only_core

        # the clustering (and plotting) runs right at construction time
        self._run_DBSCAN()

    def _plot_points(self, mask, color, markersize):
        """Draw the samples selected by `mask` into the current axes."""
        xy = np.asarray(self.X)[mask]
        # single-feature data is drawn along the line y=0
        y = xy[:, 1] if xy.shape[1] > 1 else [0] * xy.shape[0]
        plt.plot(
            xy[:, 0],
            y,
            'o',
            markerfacecolor=color,
            markeredgecolor='black',
            markersize=markersize)

    def _run_DBSCAN(self):
        """Perform the DBSCAN clustering, plot it and store the cluster masks."""
        # `min_samples` is keyword-only in current scikit-learn releases, so
        # both parameters are passed by name.
        db = DBSCAN(eps=self.eps, min_samples=self.min_samples).fit(self.X)
        labels = db.labels_
        core_samples_mask = np.zeros_like(labels, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True

        unique_labels = set(labels)
        colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
        self.n_clusters_ = len(unique_labels) - (1 if -1 in labels else 0)
        self.clustered_indices = []  # one boolean membership mask per cluster
        self.n_rejected = 0

        # Clusters in ascending label order, the noise label (-1) last, so the
        # colour assignment and the order of `clustered_indices` are
        # deterministic.
        ordered_labels = sorted(unique_labels, key=lambda lab: (lab == -1, lab))
        for k, col in zip(ordered_labels, colors):
            class_member_mask = (labels == k)
            if k == -1:
                # count the noise samples themselves; the mask has one entry
                # per input sample, so its length is not the noise count
                self.n_rejected = int(np.count_nonzero(class_member_mask))
                self._plot_points(class_member_mask, 'black', 6)
                continue

            if self.only_core:
                cluster_members = class_member_mask & core_samples_mask
            else:
                cluster_members = class_member_mask
            # stored members large, non-core members small on top
            self._plot_points(cluster_members, col, 14)
            self._plot_points(class_member_mask & ~core_samples_mask, col, 6)
            self.clustered_indices.append(cluster_members)

        plt.gca().invert_yaxis()
        plt.title('Estimated number of clusters: %d' % self.n_clusters_)
        self.db = db
In [ ]:
cluster[blotchcols]
In [ ]:
xy_angles = clustering.angle_to_xy(cluster.angle, 'blotch')
In [ ]:
xy_angles
In [ ]:
xy_angles.shape
In [ ]:
plt.figure(figsize=(5*1.3,5))
clusterer = DBScanner(xy_angles, eps=20*np.pi/360, min_samples=3)
In [ ]:
data.loc[clusterer.clustered_indices[1]]
In [ ]:
for cluster_members in clusterer.clustered_indices:
clusterdata = data.loc[cluster_members, blotchcols + ['user_name']]
print(len(clusterdata))
angle_clustered = clustering.cluster_angles(clusterdata, 'blotch')
for indices in angle_clustered:
angle_clusterdata = clusterdata.loc[indices, blotchcols +
['user_name']]
filtered = angle_clusterdata.groupby('user_name').first()
print(len(filtered))
In [ ]:
cm.min_samples
In [ ]:
30* cm.min_samples_factor
In [ ]:
cm.reduced_data['blotch']
In [ ]:
cm.cluster_angles
In [ ]:
db = clustering.cluster_angles(cluster, 'blotch')
len(db[0])
In [ ]:
len(cluster)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
filtered = cluster.groupby('user_name').first()
In [ ]:
plt.figure()
filtered.angle.hist()
In [ ]:
In [ ]:
toprint = cluster2[markings.Fan.to_average + ['user_name', 'marking', 'classification_id']]
In [ ]:
toprint.to_clipboard(index=False)
In [ ]:
def add_angle_vector(df):
    """Return a copy of `df` extended by the unit vector of its angles.

    The `angle` column is read as degrees; its cosine is stored in the new
    column 'xang' and its sine in 'yang'. The input frame is left untouched.
    """
    radians = np.deg2rad(df.angle)
    return df.assign(xang=np.cos(radians), yang=np.sin(radians))
In [ ]:
cluster2 = add_angle_vector(cluster2)
In [ ]:
cluster2
In [ ]:
def angle_to_xy(angle):
    """Convert angles in degrees to points on the unit circle.

    Returns an array with one row per angle holding (cos, sin).
    """
    radians = np.deg2rad(angle)
    stacked = np.vstack((np.cos(radians), np.sin(radians)))
    return np.transpose(stacked)
In [ ]:
def cluster_angles(angles, delta_angle):
    """Cluster angles by running DBScanner on their unit-circle points.

    Parameters
    ----------
    angles : array-like
        Angles in degrees, converted with `angle_to_xy`.
    delta_angle : float
        Angular tolerance in degrees; scaled linearly into the `eps` distance.

    Returns
    -------
    DBScanner
        The scanner object, created with min_samples=3.
    """
    # numerically equals 2*sin(0.5 deg): the distance between two unit
    # vectors that are one degree apart
    dist_per_degree = 0.017453070996747883
    return DBScanner(
        angle_to_xy(angles),
        eps=delta_angle * dist_per_degree,
        min_samples=3,
    )
In [ ]:
clusterer = cluster_angles(cluster.angle, 10)
In [ ]:
clusterer.db.core_sample_indices_
In [ ]:
clusterer.db.labels_
In [ ]:
cluster.shape
In [ ]:
clusterer.clustered_indices
In [ ]:
cluster2.iloc[clusterer.clustered_data[0]]
In [ ]:
In [ ]:
dbscanner.reduced_data[0]
This means all ellipses were clustered together. eps=10 picks 3 out of these 6.
In [ ]:
clusterdata = data.iloc[dbscanner.reduced_data[0]]
So clusterdata is just the same as the input data; I just repeat the exact same code steps here for consistency.
In [ ]:
clusterdata[blotchcols]
In [ ]:
In [ ]:
meandata = clusterdata.mean()
meandata
In [ ]:
from scipy.stats import circmean
In [ ]:
meandata.angle = circmean(clusterdata.angle, high=180)
In [ ]:
meandata
In [ ]:
n_class_old = data.classification_id.nunique()
n_class_old
In [ ]:
# number of classifications that include fan and blotches
f1 = data.marking == 'fan'
f2 = data.marking == 'blotch'
n_class_fb = data[f1 | f2].classification_id.nunique()
n_class_fb
In [ ]:
data=data[data.marking=='blotch']
In [ ]:
plotting.plot_raw_blotches('bvc')
In [ ]:
fans.plot(kind='scatter', x='x',y='y')
plt.gca().invert_yaxis()
In [ ]:
fx1 = data.x < 400
fx2 = data.x > 300
fy1 = data.y_R > 300
fy2 = data.y_R < 400
In [ ]:
data = data.reset_index()
In [ ]:
data[fx1 & fx2 & fy1 & fy2].angle
In [ ]:
cm.dbscanner.reduced_data
In [ ]:
dbscanner = dbscan.DBScanner()
In [ ]:
db = io.DBManager()
In [ ]:
data = db.get_obsid_markings('ESP_020568_0950')
In [ ]:
image_ids = data.image_id.unique()
In [ ]:
%matplotlib nbagg
import seaborn as sns
sns.set_context('notebook')
In [ ]:
p4id = markings.ImageID(image_ids[0])
p4id.plot_fans()
In [ ]:
p4id.plot_fans(data=p4id.data.query('angle>180'))
In [ ]:
p4id.imgid
In [ ]:
data[data.marking=='fan'].angle.describe()
In [ ]:
dbscanner.cluster_image_name('PSP_002622_0945')
In [ ]:
db = io.DBManager()
In [ ]:
db.get_image_name_markings('PSP_002622_0945')
In [ ]:
obsids = 'ESP_020476_0950, ESP_011931_0945, ESP_012643_0945, ESP_020783_0950'.split(', ')
In [ ]:
obsids
In [ ]:
def process_obsid(obsid):
    """Cluster one observation ID and hand the ID back.

    The planet4 import happens inside the function so it can be executed on
    parallel workers. Results are saved under a folder named after the
    observation ID (`savedir=obsid`).
    """
    from planet4 import catalog_production

    catalog_production.do_cluster_obsids(obsid, savedir=obsid)
    return obsid
In [ ]:
from nbtools import execute_in_parallel
In [ ]:
execute_in_parallel(process_obsid, obsids)
In [ ]:
db = io.DBManager()
for obsid in obsids:
data = db.get_image_name_markings(obsid)
image_ids = data.image_id.drop_duplicates().sample(n=50)
for id_ in image_ids:
print(id_)
plotting.plot_image_id_pipeline(id_, datapath=obsid, save=True,
saveroot=f'plots/{obsid}',
via_obsid=True)
plt.close('all')
In [ ]:
plotting.plot_finals('prv', datapath=obsids[0], via_obsid=True)
In [ ]: